library(plyr)
## Warning: package 'plyr' was built under R version 3.1.3
library(maps)
## Warning: package 'maps' was built under R version 3.1.3
##
## # ATTENTION: maps v3.0 has an updated 'world' map. #
## # Many country borders and names have changed since 1990. #
## # Type '?world' or 'news(package="maps")'. See README_v3. #
##
##
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:plyr':
##
## ozone
#read in citibike data
citibike <- read.csv("201512-citibike-tripdata.csv")
#attach citibike
attach(citibike)
#take a look at the head
head(citibike)
## tripduration starttime stoptime start.station.id
## 1 475 12/1/2015 07:35:36 12/1/2015 07:43:32 72
## 2 684 12/1/2015 07:38:15 12/1/2015 07:49:39 72
## 3 1063 12/1/2015 07:44:49 12/1/2015 08:02:33 72
## 4 1075 12/1/2015 08:02:29 12/1/2015 08:20:24 72
## 5 293 12/1/2015 08:06:37 12/1/2015 08:11:30 72
## 6 812 12/1/2015 08:07:57 12/1/2015 08:21:30 72
## start.station.name start.station.latitude start.station.longitude
## 1 W 52 St & 11 Ave 40.76727 -73.99393
## 2 W 52 St & 11 Ave 40.76727 -73.99393
## 3 W 52 St & 11 Ave 40.76727 -73.99393
## 4 W 52 St & 11 Ave 40.76727 -73.99393
## 5 W 52 St & 11 Ave 40.76727 -73.99393
## 6 W 52 St & 11 Ave 40.76727 -73.99393
## end.station.id end.station.name end.station.latitude
## 1 173 Broadway & W 49 St 40.76065
## 2 520 W 52 St & 5 Ave 40.75992
## 3 358 Christopher St & Greenwich St 40.73292
## 4 505 6 Ave & W 33 St 40.74901
## 5 525 W 34 St & 11 Ave 40.75594
## 6 484 W 44 St & 5 Ave 40.75500
## end.station.longitude bikeid usertype birth.year gender
## 1 -73.98443 22780 Subscriber 1983 1
## 2 -73.97649 17787 Subscriber 1975 1
## 3 -74.00711 18797 Subscriber 1966 1
## 4 -73.98848 14625 Subscriber 1985 1
## 5 -74.00212 21238 Subscriber 1968 1
## 6 -73.98014 19518 Subscriber 1960 2
#build a table for bike station id to longitude/latitude
uniqueStation <- citibike[!duplicated(citibike$start.station.id),c("start.station.id", "start.station.latitude", "start.station.longitude", "start.station.name")]
colnames(uniqueStation) <- c("station_id", "latitude", "longitude", "station_name")
#number of stations being used in December 2015
nrow(uniqueStation)
## [1] 471
In effort to identify the bike stations that are candidates for improving bike storage capacity, we decided to investigate the difference between bike departure and bike arrivals for each station.
#convert station id to factor
start.station.id <- as.factor(start.station.id)
end.station.id <- as.factor(end.station.id)
#Count start and end frequencies by station id
numStartStation <- count(start.station.id)
colnames(numStartStation) <- c("station_id", "start_freq")
numEndStation <- count(end.station.id)
colnames(numEndStation) <- c("station_id", "end_freq")
stationCount <- merge(numStartStation, numEndStation, sort = FALSE, by = "station_id")
stationCount$dep_minus_arr <- stationCount$start_freq - stationCount$end_freq
stationCount <- arrange(stationCount, stationCount$dep_minus_arr)
#get top 10 stations that have more arrivals than departures
mostArrivalStations <- head(stationCount, 10)
mostArrivalStations
## station_id start_freq end_freq dep_minus_arr
## 1 415 1458 2261 -803
## 2 477 4497 5250 -753
## 3 324 986 1663 -677
## 4 497 6105 6760 -655
## 5 492 3978 4586 -608
## 6 382 3968 4529 -561
## 7 514 2833 3379 -546
## 8 147 3054 3470 -416
## 9 368 4757 5119 -362
## 10 348 2790 3143 -353
#get top 10 stations that have more departures than arrivals
mostDepartureStations <- tail(stationCount, 10)
mostDepartureStations
## station_id start_freq end_freq dep_minus_arr
## 461 507 3143 2833 310
## 462 447 2857 2517 340
## 463 362 1977 1631 346
## 464 468 3261 2871 390
## 465 356 1664 1189 475
## 466 432 3620 3029 591
## 467 511 3492 2750 742
## 468 517 4165 3411 754
## 469 3236 2097 706 1391
## 470 3230 3937 2069 1868
longitude = c()
latitude = c()
for (i in 1:length(mostArrivalStations$station_id)){
longitude[i] = citibike$end.station.longitude[match(x = mostArrivalStations$station_id[i], table = citibike$end.station.id)]
latitude[i] = citibike$end.station.latitude[match(x = mostArrivalStations$station_id[i], table = citibike$end.station.id)]
}
mostArrivalStations$longitude = longitude
mostArrivalStations$latitude = latitude
longitude = c()
latitude = c()
for (i in 1:length(mostDepartureStations$station_id)){
longitude[i] = citibike$start.station.longitude[match(x = mostDepartureStations$station_id[i], table = citibike$start.station.id)]
latitude[i] = citibike$start.station.latitude[match(x = mostDepartureStations$station_id[i], table = citibike$start.station.id)]
}
mostDepartureStations$longitude = longitude
mostDepartureStations$latitude = latitude
#plot(numStartStation$freq ~ numStartStation$x)
To help visualize our findings, we will plot the stations with the most arrivals relative to departures as red points and the most departures relative to arrivals as blue points. Points with larger radius indicate a larger difference between departure and arrivals.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.1.3
nymap <- get_map(location = "New York", zoom = 12, color = 'bw')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York&zoom=12&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York&sensor=false
#map the stations with the most arrivals relative to departures
maxArrWeight <- max(-mostArrivalStations$dep_minus_arr)
mostArrAlpha <- abs(mostArrivalStations$dep_minus_arr)/maxArrWeight
maxDepWeight <- max(mostDepartureStations$dep_minus_arr)
mostDepAlpha <- mostDepartureStations$dep_minus_arr/maxDepWeight
mapPoints <- ggmap(nymap) + geom_point(aes(x = longitude, y = latitude), size = 6*mostArrAlpha, colour = 'red', data = mostArrivalStations) + geom_point(aes(x = longitude, y = latitude), size=6*mostDepAlpha, colour = 'blue', data = mostDepartureStations)
mapPoints
Based off of this data, we recommend that storage capacity be increased at the locations with blue dots; this is to keep a high inventory of bikes in order to accommodate departure demand when there is not enough influx of arrivals to match that demand. We also recommend that storage capacity secondly be increased for locations with red dots to accommodate the influx of arrivals. Alternatively, the citibike system can implement a system that regularly transports bikes from red locations to blue locations; this system of transport would then remove the need to invest in storage capacity increase at red locations, though the exact costs and benefits of implementing this system versus increasing storage capacity at red locations would require another analysis to determine.
Next, to address the increase in bike maintenance bills, we will first explore bike usage by bike ID. First we take a look at the total number of trips involving each bike.
attach(citibike)
## The following objects are masked _by_ .GlobalEnv:
##
## end.station.id, start.station.id
##
## The following objects are masked from citibike (pos = 5):
##
## bikeid, birth.year, end.station.id, end.station.latitude,
## end.station.longitude, end.station.name, gender,
## start.station.id, start.station.latitude,
## start.station.longitude, start.station.name, starttime,
## stoptime, tripduration, usertype
bikeCount <- count(bikeid)
#Number of unique bikes used over the month of December 2015
print(nrow(bikeCount))
## [1] 7248
#Analyze frequency
summary(bikeCount$freq)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 71.0 95.0 110.9 127.0 359.0
#plot box plot of bike usage frequency without outliers
boxplot(bikeCount$freq, outline = FALSE, horizontal = TRUE, main = "Frequency of bike usage in December 2015", xlab = "Number of trips")
From the summary and boxplot above, we see that out of the 7248 bikes, the interquartile range of 56 trips is rather large with respect to the median of 95 trips. This cursory analysis suggests that not all bikes are used an equal amount.
However, not all trips have the same duration. So we chose to analyze each bike’s total trip duration throughout the month of December 2015 as well.
attach(citibike)
## The following objects are masked _by_ .GlobalEnv:
##
## end.station.id, start.station.id
##
## The following objects are masked from citibike (pos = 3):
##
## bikeid, birth.year, end.station.id, end.station.latitude,
## end.station.longitude, end.station.name, gender,
## start.station.id, start.station.latitude,
## start.station.longitude, start.station.name, starttime,
## stoptime, tripduration, usertype
##
## The following objects are masked from citibike (pos = 6):
##
## bikeid, birth.year, end.station.id, end.station.latitude,
## end.station.longitude, end.station.name, gender,
## start.station.id, start.station.latitude,
## start.station.longitude, start.station.name, starttime,
## stoptime, tripduration, usertype
bikeUsageById <- tapply(tripduration, start.station.id, sum, na.rm = TRUE)
bikeUsageById <- bikeUsageById/3600
summary(bikeUsageById)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.408 147.400 407.800 448.500 628.300 2868.000
boxplot(bikeUsageById, horizontal = TRUE, main = "Frequency of total bike duration in December 2015", xlab = "Accumulated Bike Duration (hours)")
From the graph above, it is easy to see that the frequency distribution is skewed to the right - meaning there are a few bikes with a much higher accumulated bike durations than the rest. Just by observing the box plot we can reasonably infer that some bikes are being used much more than other bikes. In fact the standard deviation of the total accumulated duration per bike is 367.2511044, which is significantly higher than 0, which is what we expect if all bikes truly had equal usage.
Since we have even more evidence to suggest the uneven distribution of bike usage, we will investigate bike stations and routes to see which stations/routes tend to have the longest trip durations.
First, we want to analyze the trip duration data for stations originating the longest rides. We perform a cursory data exploration as follows:
#get of trips duration in minutes
dur <- citibike$tripduration/60
dur <- sort(dur)
summary(dur)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.02 5.92 9.50 15.76 15.68 48880.00
boxplot(dur, horizontal = TRUE, xlab = "trip duration (minutes)", main = "Frequency analysis of bike trip duration in December 2015")
It is extremely apparent that there are a lot of outliers that do not make sense in the context of this data. For example, the maximum trip duration lasted 4.8881310^{4} minutes - which seems quite improbable given the context of intraday bike rentals.
Replotting the boxplot to remove the extreme outliers we see the following:
boxplot(dur, horizontal = TRUE, outline= FALSE, xlab = "trip duration (minutes)", main = "Frequency analysis of bike trip duration in December 2015")
In this chart we see that the upper end of the trip duration is around 30 minutes. However, trip durations that are less than 2 hours may very well be considered a “normal” bike trip so we will filter for bike trips that are no more than 2 hours in duration.
filteredDuration <- citibike[citibike$tripduration <= 120*60,]
To check that we did not remove too much essential data points, we see that out of 15 original trips, we have only removed 3670 trips for being considered outliers.
Now that we have filtered out some of the extreme tripduration data points, we will proceed to perform analysis on stations that originated the longest rides.
avgDurationByStation <- aggregate(x = filteredDuration$tripduration, by = list(filteredDuration$start.station.id), FUN = mean)
avgDurationByStation <- merge(x = avgDurationByStation, y = uniqueStation, by.x = 1, by.y = 1, sort = FALSE)
colnames(avgDurationByStation) <- c("station_id", "average_trip_duration", "latitude", "longitude", "station_name")
#sort by average duration
avgDurationByStation <- avgDurationByStation[order(avgDurationByStation$average_trip_duration),]
#shortest average duration bike routes
shortestBikeRoutes <- head(avgDurationByStation, 10)
shortestBikeRoutes
## station_id average_trip_duration latitude longitude
## 41 239 420.1669 40.69197 -73.98130
## 378 3097 435.0749 40.71922 -73.94245
## 353 3072 464.5509 40.70583 -73.94645
## 404 3124 467.4230 40.74731 -73.95451
## 367 3086 477.3735 40.71514 -73.94451
## 67 270 485.9845 40.69308 -73.97179
## 60 262 490.8465 40.69178 -73.97373
## 370 3089 496.2784 40.71732 -73.94820
## 309 2001 515.0972 40.69977 -73.97993
## 391 3111 527.7339 40.72585 -73.95065
## station_name
## 41 Willoughby St & Fleet St
## 378 N Henry St & Richardson St
## 353 Leonard St & Boerum St
## 404 46 Ave & 5 St
## 367 Graham Ave & Conselyea St
## 67 Adelphi St & Myrtle Ave
## 60 Washington Park
## 370 Leonard St & Meeker Ave
## 309 Sands St & Navy St
## 391 Norman Ave & Leonard St
#longest average duration bike routes
longestBikeRoutes <- tail(avgDurationByStation, 10)
longestBikeRoutes
## station_id average_trip_duration latitude longitude
## 175 398 1346.067 40.69165 -73.99998
## 414 3137 1391.467 40.77283 -73.96685
## 440 3165 1406.349 40.77579 -73.97621
## 413 3136 1430.622 40.76650 -73.97148
## 454 3180 1433.655 40.69878 -73.99712
## 74 281 1441.507 40.76440 -73.97371
## 435 3160 1528.404 40.77897 -73.97375
## 420 3143 1568.035 40.77683 -73.96389
## 314 2006 1742.980 40.76591 -73.97634
## 79 290 5069.000 40.76020 -73.96478
## station_name
## 175 Atlantic Ave & Furman St
## 414 5 Ave & E 73 St
## 440 Central Park West & W 72 St
## 413 5 Ave & E 63 St
## 454 Brooklyn Bridge Park - Pier 2
## 74 Grand Army Plaza & Central Park S
## 435 Central Park West & W 76 St
## 420 5 Ave & E 78 St
## 314 Central Park S & 6 Ave
## 79 2 Ave & E 58 St
#plot the shortest duration bike routes
nymap <- get_map(location = "New York", zoom = 12, color = 'bw')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York&zoom=12&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York&sensor=false
shortRouteMap <- ggmap(nymap) + geom_point(aes(x = longitude, y = latitude), colour = 'green', data = shortestBikeRoutes)
shortRouteMap
#plot the longest duration bike routes
longRouteMap <- ggmap(nymap) + geom_point(aes(x = longitude, y = latitude), colour = 'dark green', data = longestBikeRoutes)
longRouteMap
From the plot of shortest bike routes, it seems that the majority of stations are stationed around Brooklyn. The longest bike routes tend to originate in Manhattan stations.
SSandES = data.frame(citibike$start.station.id,citibike$end.station.id)
allroutes = count(SSandES)
allroutes = arrange(allroutes,desc(freq))
popularroutes = head(allroutes,20)
#append street name and long/lat info
popularroutes = merge(popularroutes, uniqueStation, by.x = 1, by.y = 1)
popularroutes = merge(popularroutes, uniqueStation, by.x = 2, by.y = 1)
colnames(popularroutes) <- c("end_id", "start_id", "freq", "start_latitude", "start_longitude", "start_station_name", "end_latitude", "end_longitude", "end_station_name")
nymap <- get_map(location = "New York", zoom = 12, color = 'bw')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York&zoom=12&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York&sensor=false
popularRouteMap <- ggmap(nymap)
for (i in 1:length(popularroutes)){
from = as.character(paste("New York,", popularroutes[i, "start_station_name"]))
to = as.character(paste("New York,", popularroutes[i, "end_station_name"]))
route_df = route(from, to, structure = "route")
popularRouteMap <- popularRouteMap + geom_path(
aes(x = lon, y = lat), color = i+2, size = 1.5,
data = route_df, lineend = "round"
)
}
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=New+York,+Grand+Army+Plaza+&+Central+Park+S&destination=New+York,+Grand+Army+Plaza+&+Central+Park+S&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=New+York,+E+7+St+&+Avenue+A&destination=New+York,+Lafayette+St+&+E+8+St&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=New+York,+W+26+St+&+8+Ave&destination=New+York,+11+Ave+&+W+27+St&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=New+York,+W+17+St+&+8+Ave&destination=New+York,+W+20+St+&+11+Ave&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=New+York,+W+21+St+&+6+Ave&destination=New+York,+W+22+St+&+10+Ave&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=New+York,+11+Ave+&+W+41+St&destination=New+York,+8+Ave+&+W+33+St&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=New+York,+Pershing+Square+North&destination=New+York,+E+24+St+&+Park+Ave+S&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=New+York,+Pershing+Square+North&destination=New+York,+W+33+St+&+7+Ave&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=New+York,+W+21+St+&+6+Ave&destination=New+York,+9+Ave+&+W+22+St&mode=driving&units=metric&alternatives=false&sensor=false
popularRouteMap
To minimize bike maintenance bills, we recommend the implementation of a system that regularly exchanges bikes between the stations that originate the longest rides with the stations that originate the shortest rides. Of course, this system would have its own costs, and a separate analysis would be needed to determine whether the pay-offs of the system are worth the costs.
After examining the most popular routes by bike in NYC, and found that these routes tend to be in lower-mid Manhattan as well as around parks and piers. We anticipate that the bikes following these routes will exhibit the most wear and require the most maintenance. To distribute the usage frequency, we recommend regularly refreshing and replacing the bikes commonly used at these “popular route” stations" with those bikes of stations along other routes.
First we identify the bike stations that have the least youthful attendance.
citibike$isYoung <- ifelse(test = is.na(citibike$birth.year), yes = FALSE, no = ((2016 - citibike$birth.year) <= 25))
stationYouthCount <- aggregate(x = citibike$isYoung, by = list(citibike$start.station.id), FUN = sum)
colnames(stationYouthCount) <- c("station_id", "count")
stationYouthCount <- stationYouthCount[order(stationYouthCount$count),]
#append longitude and latitude to data
longitude = c()
latitude = c()
for (i in 1:length(stationYouthCount$station_id)){
longitude[i] = citibike$start.station.longitude[match(x = stationYouthCount$station_id[i], table = citibike$start.station.id)]
latitude[i] = citibike$start.station.latitude[match(x = stationYouthCount$station_id[i], table = citibike$start.station.id)]
}
stationYouthCount$longitude = longitude
stationYouthCount$latitude = latitude
#get the top 20 bike stations with the least amount of youth
leastYouthfulStations <- head(stationYouthCount, n = 20)
leastYouthMap <- get_map(location = "New York", zoom = 12, color = 'bw')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York&zoom=12&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York&sensor=false
#map the locations of the least youthful stations
leastYouthfulPoints <- ggmap(leastYouthMap) + geom_point(aes(x = longitude, y = latitude), colour = 'dark green', data = leastYouthfulStations)
leastYouthfulPoints
Next we identify the stations less used by tourists
stationTourist <- aggregate(x = citibike$usertype == "Customer", by = list(citibike$start.station.id), FUN = sum)
colnames(stationTourist) <- c("station_id", "count")
stationTourist <- stationTourist[order(stationTourist$count),]
longitude = c()
latitude = c()
for (i in 1:length(stationTourist$station_id)){
longitude[i] = citibike$start.station.longitude[match(x = stationTourist$station_id[i], table = citibike$start.station.id)]
latitude[i] = citibike$start.station.latitude[match(x = stationTourist$station_id[i], table = citibike$start.station.id)]
}
stationTourist$longitude = longitude
stationTourist$latitude = latitude
#get the top 20 bike stations with least amount of tourists
stationLeastTuorist <- head(stationTourist, n = 20)
leastTouristMap <- get_map(location = "New York", zoom = 12, color = 'bw')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York&zoom=12&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York&sensor=false
#map the locations of the least youthful stations
leastTouristPoints <- ggmap(leastTouristMap) + geom_point(aes(x = longitude, y = latitude), colour = 'dark blue', data = stationLeastTuorist)
leastTouristPoints
With these maps we have identified the stations least used by youth and by tourists. Promotion should be targeted at these marked locations.
In general, the distribution of bike demand will heavily depend on the time of day (and day of the week) investigated. We now will perform an analysis of bike rental frequency by time of day.
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.1.3
##
## Attaching package: 'lubridate'
##
## The following object is masked from 'package:plyr':
##
## here
bikeStartTime <- as.character(citibike$starttime)
bikeStartTime <- strptime(bikeStartTime, "%m/%d/%Y %H:%M:%S")
bikeStartByHour <- hour(bikeStartTime)
bikeByHourCnt <- count(bikeStartByHour)
colnames(bikeByHourCnt) <- c("Time of Day", "Number of Bike Rides")
barplot(bikeByHourCnt[,"Number of Bike Rides"], names.arg = bikeByHourCnt[,"Time of Day"], main = "Bike Rides by Time of Day", xlab = "Time of Day", ylab = "Number of Bike Rides", col = "light blue")
The result interesting because we see a bi-modal distribution that peaks during 8 a.m. - 9 a.m. in the morning 5 p.m. - 6 p.m. in the evening. This makes it pretty apparent that bikes are used most often to commute to and from work.
But what about bike usage by time during weekends? We decided to perform the same analysis above, but this time first partitioning the data set into weekends and weedays.
isWeekend <- bikeStartTime$wday >= 6
bikeStartWeekend <- bikeStartByHour[c(isWeekend)]
bikeStartWeekday <- bikeStartByHour[c(!isWeekend)]
d <- count(bikeStartWeekend)
bikeByHourWeekday <- count(bikeStartWeekday)
bikeByHourWeekend <- count(bikeStartWeekend)
colnames(bikeByHourWeekend) <- c("Time of Day", "Number of Bike Rides")
colnames(bikeByHourWeekday) <- c("Time of Day", "Number of Bike Rides")
barplot(bikeByHourWeekend[,"Number of Bike Rides"], names.arg = bikeByHourWeekend[,"Time of Day"], main = "Weekend: Bike Rides by Time of Day", xlab = "Time of Day", ylab = "Number of Bike Rides", col = "light green")
barplot(bikeByHourWeekday[,"Number of Bike Rides"], names.arg = bikeByHourWeekday[,"Time of Day"], main = "Weekday: Bike Rides by Time of Day", xlab = "Time of Day", ylab = "Number of Bike Rides", col = "violet")
The data seems to substantiate our claim that most bike rides are used to commute to and from work since we still observe the bi-modal distribution on the bike trip frequency graph over weekdays. However on weekends, the graph looks more uni-modal with the peak at around 1 p.m. (which is usually a time that people explore the city and hang out with friends).
With this information, it is possible to regulate bike traffic and also increase revenue. We can place a slight increase on prices of rented bikes during the peak hours; we expect demand to be relatively inelastic at these hours where customers are travelling to and from work, thereby increasing revenue per consumer. To distribute bike usage, we can lower prices during hours where usage dips; we expect this may draw some more elastic demand, such as luxury trips (as indicated by most popular routes) for scenic rides around the park and pier, thereby making bike maintenance more manageable.
We decided to also investigate bike trips conditioned by gender of the bikers.
undisclosedBikers <- citibike[citibike$gender == 0,]
maleBikers <- citibike[citibike$gender == 1, ]
femaleBikers <- citibike[citibike$gender == 2,]
genderCount <- c(nrow(undisclosedBikers), nrow(maleBikers), nrow(femaleBikers))
barplot(genderCount, names.arg = c("Undisclosed", "Male", "Female"), main = "Bike Trips by Gender", xlab = "Gender", ylab = "Number of Bike Trips", col = c(1,2,3))
From the data, we observe a very large gender asymmetry with respect to bike riders. We believe that this data highlights that Citibike’s current promotional strategies have not been successful in attracting female bikers. This may be an issue that Citibike should attempt to address in order to increase its overall user base.